# load libraries
library(quanteda)
library(readtext)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
library(tidyverse)
library(tm)
library("textcat")
library("quanteda.textplots")
library("quanteda.textstats")
library("gsubfn")
library("spacyr")
spacy_initialize(model = "de_core_news_sm")
# load corpus files
full_corpus = readRDS("corpora/full_corpus.rds")
full_corpus_sents = readRDS("corpora/full_corpus_sents.rds")
pro_corpus = readRDS("corpora/pro_corpus.rds")
contra_corpus = readRDS("corpora/contra_corpus.rds")
pro2000 = readRDS("corpora/pro2000.rds")
pro900 = readRDS("corpora/pro900.rds")
contra2000 = readRDS("corpora/contra2000.rds")
contra900 = readRDS("corpora/contra900.rds")
fff_de_corpus = readRDS("corpora/fff_de_corpus.rds")
ikem_corpus = readRDS("corpora/ikem_corpus.rds")
klimarep_corpus = readRDS("corpora/klimarep_corpus.rds")
klimafakten_corpus = readRDS("corpora/klimafakten_corpus.rds")
zero_corpus = readRDS("corpora/zero_corpus.rds")
komma_corpus = readRDS("corpora/komma_corpus.rds")
eike_corpus = readRDS("corpora/eike_corpus.rds")
ffh_corpus = readRDS("corpora/ffh_corpus.rds")
# to get index number
#id_pro = 1:ndoc(pro900)
pro2000_sum <- summary(pro2000, ndoc(pro2000))
contra2000_sum <- summary(contra2000)
pro900_sum <- summary(pro900, ndoc(pro900))
contra900_sum <- summary(contra900, ndoc(contra900))
#to get id as x axis
#id_pro[1:100]
#contra900_sum$id
ggplot(pro2000_sum, aes(id, Sentences, group=1)) +
geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=0, vjust=1, hjust=1)) +
ggtitle("Sentences Pro2000")
ggplot(contra2000_sum, aes(id, Sentences, group=1)) +
geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=0, vjust=1, hjust=1)) +
ggtitle("Sentences Contra2000")
pro900_sum <- summary(pro900, n=100)
contra900_sum <- summary(contra900, n=100)
ggplot(pro900_sum, aes(pro900_sum$id, Tokens, group=1)) +
geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=0, vjust=1, hjust=1)) +
ggtitle("Tokens Pro900")
ggplot(contra900_sum, aes(contra900_sum$id, Tokens, group=1)) +
geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=0, vjust=1, hjust=1)) +
ggtitle("Tokens Contra900")
ggplot(pro900_sum, aes(pro900_sum$id, Types, group=1)) +
geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=0, vjust=1, hjust=1)) +
ggtitle("Types Pro900")
ggplot(contra900_sum, aes(contra900_sum$id, Types, group=1)) +
geom_line() +
geom_point() +
theme(axis.text.x = element_text(angle=0, vjust=1, hjust=1)) +
ggtitle("Types Contra900")
NA
NA
ggplot(pro900_sum, aes(Tokens, Types, group=1, label= id)) +
geom_smooth(method = "lm", formula ="y ~ x", se = FALSE) +
geom_text(check_overlap = T) +
ggtitle("Type-Token-Relation Pro900")
ggplot(contra900_sum, aes(Tokens, Types, group=1, label= id)) +
geom_smooth(method = "lm", formula ="y ~ x", se = FALSE) +
geom_text(check_overlap = T) +
ggtitle("Type-Token-Relation Contra900")
<<<<<<< Updated upstream # Word Frequencies
# stoplists
de_stopwords <- stopwords::stopwords("de", source="snowball")
en_stopwords <- stopwords::stopwords("en", source="snowball" )
custom_stopwords <- read.table("de_complete.txt", header=F, sep="\n")
# add own stopwords
full_stopwords <- c(de_stopwords, "dass", "=", "the", "seit", "ab", "beim", "\n", "mal", "c", "|", "m", "kommentare", "neueste", "gepostet", custom_stopwords, en_stopwords)
de_stopwords1 <- c(de_stopwords, "dass", "=", "the", "seit", "ab", "beim", "\n", "mal", "c", "\\|","|", "m", "kommentare", "neueste", "gepostet", "admin", "cookies", "inhalte", "inhalt", "newsletter", "posten", "zugriff", "passwort", "geschützt", "seite", "website", "webseite", "and", "0", "1", "2", "3","4","5","6","7","8","9", "mfg","w","t","wer")
# create dfm
dfm_p2000 <- dfm(pro2000, remove=full_stopwords, remove_punct=TRUE, remove_numbers=TRUE)
Warnung: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
dfm_c2000 <- dfm(contra2000, remove=full_stopwords, remove_punct=TRUE, remove_numbers=TRUE)
Warnung: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
# pro
sp_pro2000 <- spacy_parse(pro2000, pos=FALSE, entity=FALSE, dependency=FALSE)
Warnung in spacy_parse.character(pro2000, pos = FALSE, entity = FALSE, dependency = FALSE)
lemmatization may not work properly in model 'de_core_news_sm'
sp_pro2000$token <- sp_pro2000$lemma
sp_dfm_p2000 <- as.tokens(sp_pro2000)%>%
dfm(remove=full_stopwords, remove_punct=TRUE, remove_numbers=TRUE, tolower=TRUE)
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
# contra
sp_contra2000 <- spacy_parse(contra2000, pos=FALSE, entity=FALSE, dependency=FALSE)
Warnung in spacy_parse.character(contra2000, pos = FALSE, entity = FALSE,
lemmatization may not work properly in model 'de_core_news_sm'
sp_contra2000$token <- sp_contra2000$lemma
sp_dfm_c2000 <- as.tokens(sp_contra2000)%>%
dfm(remove=full_stopwords, remove_punct=TRUE, remove_numbers=TRUE, tolower=TRUE)
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
dfm_p2000
Document-feature matrix of: 2,000 documents, 60,243 features (99.72% sparse) and 4 docvars.
features
docs klima update ° folge extremwetter eu-klimaplan versicherungswende wütet weltweit eu-kommission
kr_00007.txt 3 3 3 1 3 1 1 2 2 1
ikem_01141.txt 0 0 0 0 0 0 0 0 0 0
fff_de_00121.txt 0 0 0 0 0 0 0 0 0 0
ikem_00898.txt 0 0 0 0 0 0 0 0 0 0
ikem_00709.txt 0 0 0 0 0 0 0 0 0 0
ikem_00588.txt 0 0 0 0 0 0 0 0 0 0
[ reached max_ndoc ... 1,994 more documents, reached max_nfeat ... 60,233 more features ]
dfm_c2000
Document-feature matrix of: 2,000 documents, 150,974 features (99.71% sparse) and 4 docvars.
features
docs interne ermittler schule glauben ehemalige labortechnikerin erin potts-kant daten medizinische
eike_05241.txt 1 1 2 1 2 1 1 6 7 1
eike_07670.txt 0 0 0 0 0 0 0 0 0 0
eike_12660.txt 0 0 0 0 0 0 0 0 0 0
eike_13073.txt 0 0 0 0 0 0 0 0 0 0
eike_09381.txt 0 0 0 0 0 0 0 0 0 0
eike_05930.txt 0 0 0 0 0 0 0 0 0 0
[ reached max_ndoc ... 1,994 more documents, reached max_nfeat ... 150,964 more features ]
topfeatures(sp_dfm_p2000, n=50)
mehr uhr ikem mensch jahr weit deutschland geben energie thema aktuell
4283 3497 2657 2464 2062 1698 1645 1602 1592 1402 1362
sollen groß gehen klimaschutz gut sowie immer bleiben information müssen schon
1323 1319 1301 1289 1276 1207 1156 1152 1151 1144 1084
dabei future arbeit stehen energiewende politik finden welch land projekt ziel
1020 992 977 926 901 895 891 886 876 869 863
wichtig berlin kommen fridays erfahren jed zukunft klimakrise newsletter ganz neu
851 846 838 831 825 816 813 796 780 769 750
rahmen laufende möglich erst anmelden frage
748 741 735 721 720 717
topfeatures(sp_dfm_c2000, n=50)
jahr geben mehr schon immer co2 gut kommen ja gehen weit deutschland
9197 6646 6328 4722 4289 4114 4067 4056 3951 3821 3786 3643
sollen mensch sagen groß welch jed zeigen energie hoch wenig ganz global
3640 3559 3448 3378 3332 3286 3102 2921 2883 2869 2834 2523
sehen müssen temperatur strom herr natürlich stehen einfach ° zeit klima erst
2503 2480 2423 2399 2381 2236 2234 2226 2221 2215 2196 2154
finden heute frage land klimawandel liegen etwa erwärmung genau wissen erde welt
2121 2064 2048 2001 1944 1912 1901 1856 1852 1852 1824 1822
atmosphäre tun
1808 1804
tf_p2000 <- topfeatures(sp_dfm_p2000, n=50)
tf_c2000 <- topfeatures(sp_dfm_c2000, n=50)
textstat_frequency(sp_dfm_p2000, n=50)
klima_p2000 <- dfm_select(sp_dfm_p2000, pattern="klima*")
klima_c2000 <- dfm_select(sp_dfm_c2000, pattern="klima*")
topfeatures(klima_p2000, n=50)
klimaschutz klimakrise klimapolitische
1289 796 705
klimawandel klima klimapolitik
671 638 277
klimagerechtigkeit klimaziele klimastreik
239 224 200
klimawandels klimaneutral klimaneutralität
170 150 128
klima- klimaabkommen klimafinanzierung
118 102 99
klimareporter klimaschutzmaßnahmen klimaschutzgesetz
94 94 91
klimaschutzziele klimakatastrophe klimanotstand
84 80 67
klimacamp klimapaket klimagerechte
65 63 58
klimakommunikation klimaschädlichen klimaschutzes
48 47 46
klimafreundliche klimafakten.de klimaschädliche
43 41 39
klimagerechtigkeitsbewegung klimabewegung klimaneutrale
38 38 38
klimaplan klimawissen klimakonferenzen
37 36 36
klimastreiks klimaschutzpolitik klimaforscher
36 36 35
klimaabkommens klimaerwärmung klimawahl
35 32 31
klimapolitischen klimaneutralen klimakonferenz
31 29 29
klimazielen klimaziel klimafreundlichen
29 27 26
klimaforschung klimagesetz
25 24
topfeatures(klima_c2000, n=50)
klima klimawandel klimawandels klimaschutz
2196 1944 492 443
klimamodelle klimawissenschaft klimaforscher klimapolitik
278 198 176 158
klimakatastrophe klimaerwärmung klimawissenschaftler klimasensitivität
157 149 135 123
klimamodellen klimaforschung klimaleugner klima-
115 108 105 94
klimakrise klimaänderungen klimaskeptiker klimaretter
87 83 82 78
klimahysterie klimatologie klimaabkommen klimaalarmisten
67 67 57 57
klimasystem klimatisch klimatologen klima-alarmisten
57 55 55 54
klimaaktivisten klimagipfel klimafolgenforschung klimaziele
50 50 46 46
klimarettung klimaneutral klimaretter.info klimaschwankungen
45 42 42 41
klimaänderung klimakonferenz klimatische klimaschützer
41 40 39 36
klimadebatte klimareligion klimadaten klimaflüchtlinge
35 35 35 33
klimaveränderungen klimaschutzes klimamodell klimaexperten
31 30 30 29
klimakirche klimasystems
29 29
freq_p2000 <- textstat_frequency(sp_dfm_p2000, n=50)
freq_c2000 <- textstat_frequency(sp_dfm_c2000, n=50)
plot_p2000 <- with(freq_p2000, reorder(feature, -frequency))
plot_c2000 <- with(freq_c2000, reorder(feature, -frequency))
#create plot for eike klima words frequencies
plot1 <- ggplot(freq_p2000, aes(x=feature, y=frequency)) +
geom_point()+ggtitle("P2000 Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
#ggsave(plot=plot1, width = 10, height = 5, dpi=300, filename="klima_eike_plot.jpeg" )
plot1
#create plot for klimareporter klima words frequencies
plot2 <- ggplot(freq_c2000, aes(x=feature, y=frequency)) +
geom_point()+ ggtitle("C2000 Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
#ggsave(plot=plot2, width = 10, height = 5, dpi=300, filename="klima_klimarep_plot.jpeg" )
plot2
freq_klima_p2000 <- textstat_frequency(klima_p2000, n=50)
freq_klima_c2000 <- textstat_frequency(klima_c2000, n=50)
freq_klima_p2000$feature <- with(freq_klima_p2000, reorder(feature, -frequency))
freq_klima_c2000$feature <- with(freq_klima_c2000, reorder(feature, -frequency))
#create plot for eike klima words frequencies
plot1 <- ggplot(freq_klima_p2000, aes(x=feature, y=frequency)) +
geom_point()+ggtitle("P2000 Klima Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
#ggsave(plot=plot1, width = 10, height = 5, dpi=300, filename="klima_eike_plot.jpeg" )
plot1
#create plot for klimareporter klima words frequencies
plot2 <- ggplot(freq_klima_c2000, aes(x=feature, y=frequency)) +
geom_point()+ ggtitle("C2000 Klima Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
#ggsave(plot=plot2, width = 10, height = 5, dpi=300, filename="klima_klimarep_plot.jpeg" )
plot2
# to save lists of klima-words
# w/o "$feature" it saves the whole table as text file (with frequency info etc.)
#capture.output(list(freq_klima_p2000$feature), file = "terms_pro.txt")
#capture.output(list(freq_klima_c2000$feature), file = "terms_contra.txt")
# weighted words
p2000_weight <- dfm_weight(sp_dfm_p2000, scheme="prop")
c2000_weight <- dfm_weight(sp_dfm_c2000, scheme="prop")
relfreq_p2000 <- textstat_frequency(p2000_weight, n=50)
relfreq_c2000 <- textstat_frequency(c2000_weight, n=50)
#tfidf
p2000_tfidf <- dfm_tfidf(sp_dfm_p2000)
c2000_tfidf <- dfm_tfidf(sp_dfm_c2000)
#plot3 <- with(relfreq_p2000, reorder(feature, -freqency))
relfreq_p2000$feature <- with(relfreq_p2000, reorder(feature, -frequency))
plot3 <- ggplot(relfreq_p2000, aes(x=feature, y=frequency)) +
geom_point()+ggtitle("P2000 Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
#ggsave(plot=plot1, width = 10, height = 5, dpi=300, filename="klima_eike_plot.jpeg" )
plot3
pro_freq_tfidf <- p2000_tfidf %>%
textstat_frequency(n=10, force=TRUE)
con_freq_tfidf <- c2000_tfidf %>%
textstat_frequency(n=10, force=TRUE)
tplot_tfidf_p2000 <- ggplot(data=pro_freq_tfidf,
aes(x=factor(nrow(pro_freq_tfidf):1),
y=frequency)) +
geom_point() +
coord_flip() +
scale_x_discrete(breaks=factor(nrow(pro_freq_tfidf):1),
labels=pro_freq_tfidf$feature) +
labs(x=NULL, y="tf-idf")
tplot_tfidf_p2000
tplot_tfidf_c2000 <- ggplot(data=con_freq_tfidf,
aes(x=factor(nrow(con_freq_tfidf):1),
y=frequency)) +
geom_point() +
coord_flip() +
scale_x_discrete(breaks=factor(nrow(con_freq_tfidf):1),
labels=con_freq_tfidf$feature) +
labs(x=NULL, y="tf-idf")
tplot_tfidf_c2000
topfeatures(p2000_tfidf, n=20)
uhr infos| jahr cookies energie geben mehr weit gut sollen
3974.9600 1661.5255 1179.1861 1053.8321 1004.2084 964.5001 930.1357 920.5165 904.4967 869.9748
gehen deutschland schon immer groß 12:00 müssen sowie future klimaschutz
867.1851 864.9142 852.5468 824.5968 823.4997 818.0000 786.1433 774.9726 764.6637 759.5081
topfeatures(c2000_tfidf, n=20)
jahr co2 geben mehr ja ° strom schon temperatur deutschland
2420.699 2202.477 1859.821 1802.421 1683.002 1667.530 1630.969 1585.781 1569.662 1547.591
herr atmosphäre immer mensch gut w gehen zeigen kommen global
1458.625 1402.981 1394.521 1381.877 1361.991 1328.953 1312.218 1296.439 1287.389 1285.063
#p2000_weight
#textstat_frequency(p2000_tfidf, n=10)
# to remove special chars from corpus
pro2000 <- gsub("\\|", "", pro2000)
pro2000 <- gsub("=", "", pro2000)
# convert into collocation dataframe
p_coll <- textstat_collocations(pro2000, min_count=50)
arrange(p_coll, desc(count))
# transform stoplists for dplyr::filter function (why?)
de_stps <- paste0(de_stopwords, collapse = "\\b|\\b")
en_stps <- paste0(en_stopwords, collapse = "\\b|\\b")
# remove entries involving stopwords: NOT WORKING YET!
p2000_coll_clean <- p_coll %>%
dplyr::filter(!str_detect(collocation, de_stps),
!str_detect(collocation, en_stps),
!str_detect(collocation, "="))
#p2000_coll_clean
arrange(p2000_coll_clean, desc(count))
# collocations mit vorherigem stopwords removal
p2000_toks <- tokens(pro2000)
p2000_toks_sw <- tokens_select(p2000_toks, pattern=full_stopwords, selection="remove")
p2000_coll <- textstat_collocations(p2000_toks_sw, min_count=3)
arrange(p2000_coll, desc(count))
summary(full_corpus, n=5)
Corpus consisting of 4000 documents, showing 5 documents:
Text Types Tokens Sentences origin language group
kr_00007.txt 196 284 19 kr german activists
ikem_01141.txt 27 30 2 ikem german activists
fff_de_00121.txt 105 178 11 fff_de german activists
ikem_00898.txt 211 880 32 ikem german activists
ikem_00709.txt 20 43 3 ikem german activists
ggplot(freqs) +
geom_segment(aes(x=feature, xend=feature, y=frequency.x, yend=frequency.y), color="grey") +
geom_point(aes(x=feature, y=frequency.x), color = "red", size = 3 ) +
geom_point(aes(x=feature, y=frequency.y), color = "lightblue", size = 3 ) +
ggtitle("Word Frequencies") +
xlab("") + ylab("Wortfrequenz") +
coord_flip()
Warnung: Removed 1 rows containing missing values (geom_segment).
Warnung: Removed 1 rows containing missing values (geom_point).
p+labs(colour="Group")
Warnung: Removed 1 rows containing missing values (geom_segment).
Warnung: Removed 1 rows containing missing values (geom_point).
as.tokens(sp_pro2000) %>%
dfm()
Document-feature matrix of: 2,000 documents, 54,656 features (99.63% sparse) and 0 docvars.
features
docs klima update ° – folge 35 \n extremwetter , eu-klimaplan
kr_00007.txt 3 3 3 3 1 1 8 3 8 1
ikem_01141.txt 0 0 0 0 0 0 2 0 1 0
fff_de_00121.txt 0 0 0 0 0 0 6 0 9 0
ikem_00898.txt 0 0 0 0 0 0 24 0 28 0
ikem_00709.txt 0 0 0 0 0 0 12 0 0 0
ikem_00588.txt 0 0 0 3 0 0 45 0 42 0
[ reached max_ndoc ... 1,994 more documents, reached max_nfeat ... 54,646 more features ]
dfm_weight_corp <- full_corpus %>%
tokens(remove_punct = TRUE) %>%
tokens_remove(de_stopwords1) %>%
dfm() %>%
dfm_weight(scheme = "prop")
# Calculate relative frequency by president
freq_weight <- textstat_frequency(dfm_weight_corp, n = 10,
groups = dfm_weight_corp$origin)
ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_continuous(breaks = nrow(freq_weight):1,
labels = freq_weight$feature) +
labs(x = NULL, y = "Relative frequency")
summary(full_corpus, n=10)
full_corpus %>%
tokens(remove_punct = TRUE, remove_numbers=TRUE) %>%
tokens_remove(de_stopwords1) %>%
dfm() %>%
dfm_group(groups = group) %>%
dfm_trim(min_termfreq = 5, verbose = FALSE) %>%
textplot_wordcloud(comparison = TRUE, max_words=100)
#,color=c("lightblue","blue"))
textplot_xray(
kwic(tokens(pro2000), pattern = "klima*"),
kwic(tokens(contra2000), pattern = "klima*"))
library(tidytext)
Fehler in library(tidytext) : es gibt kein Paket namens ‘tidytext’
dfm_full <- dfm(full_corpus, remove=de_stopwords1, remove_punct=TRUE, remove_numbers=TRUE)
dfm_full
tm_full<-convert(dfm_full, to="topicmodels")
topicModel <- LDA(tm_full, k=5, method="Gibbs", control=list(iter = 500, verbose = 25))
terms(topicModel, 10)
Ideas: DONE: Wordcloud Plot of Comparison Group: https://quanteda.io/articles/pkgdown/examples/plotting.html
Lexical Dispersion Plot (X-Ray) -> could be done for keywords "klima*"
Next: - Calculate “Corpus Similarity” - Klimawörter Liste mit Group und Counts abspeichern -> welche Klimawörter gibt es wo und wie oft - alles mit der “Sents” Version testen - Analyse der Ergebnisse - Literatur-Recherche zu den Textmining Themen
Topic Modeling https://www.tidytextmining.com/topicmodeling.html